## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
# Code block just get the last name
fight_data_names <- fight_data %>%
mutate(red_last_name = word(R_fighter, -1, -1),
blue_last_name = word(B_fighter, -1, -1),
fighter_1 = pmin(red_last_name, blue_last_name),
fighter_2 = pmax(red_last_name, blue_last_name)) %>%
select(date, R_fighter, B_fighter, fighter_1, fighter_2)
ggplot(ppv_data, aes(x = PPV)) +
geom_histogram(binwidth = 100000) +
xlab("PPV Value") + ylab("Frequency") + ggtitle("Histogram of PPV")
ggplot(ppv_data, aes(y = PPV)) +
geom_boxplot() +
ylab("PPV Value") + ggtitle("PPV Value Boxplot")
ggplot(ppv_data, aes(x = log(PPV))) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## EDA: PPV vs. Date
# Get day of week and month parameters
ppv_dates_data <- ppv_data %>%
mutate(day_of_week = weekdays(as.Date(ppv_data$date)),
month = as.factor(format(as.Date(ppv_data$date),"%m")),
year = as.factor(format(as.Date(ppv_data$date),"%Y")))
# Violin plots showing distribution of PPV by month of year
ggplot(ppv_dates_data, aes(x = year, y = PPV)) +
geom_violin() +
stat_summary(fun.y=mean, geom="point", color="red") +
xlab("Year") + ylab("PPV Value") + ggtitle("Distribution of PPV Values by Year")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(ppv_dates_data, aes(x = month, y = PPV)) +
geom_violin() +
stat_summary(fun.y=mean, geom="point", size=1, color="red") +
xlab("Month") + ylab("PPV Value") + ggtitle("Distribution of PPV Values by Month")
# Distirbution varies quite a bit between Friday and Saturday Samples
ggplot(ppv_dates_data, aes(x = day_of_week, y = PPV)) +
geom_violin() +
stat_summary(fun.y=mean, geom="point", size=1, color="red") +
xlab("Day of Week") + ylab("PPV Value") + ggtitle("Distribution of PPV Values by Day of Week")
# Scatterplot of PPV Value by Date
ggplot(ppv_dates_data, aes(x=as.Date(date), y=PPV, col=day_of_week)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula = 'y ~ x'
ggplot(ppv_dates_data, aes(x=as.Date(date), y=PPV)) + geom_point() + geom_smooth(method='lm')
## `geom_smooth()` using formula = 'y ~ x'
ppv_with_location_division <- ppv_data %>%
inner_join(fight_data_names, by = c("date", "fighter_1", "fighter_2")) %>%
inner_join(fight_data_raw, by = c("date", "R_fighter", "B_fighter")) %>%
select(date, PPV, location, Fight_type) %>%
mutate(country = sub('.*,\\s*', '', location),
isUSA = ifelse(country == "USA", 1, 0),
isNAM = ifelse(country %in% c("USA", "Canada", "Mexico"), 1, 0),
isSAM = ifelse(country %in% c("Brazil"), 1, 0),
isEUR = ifelse(country %in% c("United Kingdom", "Ireland", "Germany"), 1, 0),
isAP = ifelse(country %in% c("Australia", "United Arab Emirates", "Japan"), 1, 0),
isWomens = ifelse(grepl("Women", Fight_type, fixed=TRUE), 1, 0),
isTitle = ifelse(grepl("Title", Fight_type, fixed=TRUE), 1, 0))
ggplot(ppv_with_location_division, aes(x = country, y = PPV)) +
geom_violin() +
stat_summary(fun.y=mean, geom="point", size=1, color="red")
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
# When the main event is a women's event, the mean PPV is higher
ggplot(ppv_with_location_division, aes(x = as.factor(isWomens), y = PPV)) +
geom_violin() +
stat_summary(fun.y = mean, geom="point", size=1, color="red")
# but, all women data points happened later in the UFC data, and there was some correlation between date and PPV
ggplot(ppv_with_location_division, aes(x=as.Date(date), y=PPV, col=as.factor(isWomens))) +
geom_point() +
geom_smooth(method='lm')
## `geom_smooth()` using formula = 'y ~ x'
# When the main event is a women's event, the mean PPV is higher
ggplot(ppv_with_location_division, aes(x = as.factor(isTitle), y = PPV)) +
geom_violin() +
stat_summary(fun.y = mean, geom="point", size=1, color="red")
# but, all women data points happened later in the UFC data, and there was some correlation between date and PPV
ggplot(ppv_with_location_division, aes(x=as.Date(date), y=PPV, col=as.factor(isTitle))) +
geom_point() +
geom_smooth(method='lm')
## `geom_smooth()` using formula = 'y ~ x'
ppv_with_location_division %>%
mutate(date = as.numeric(date)) %>%
select(date, PPV, isUSA, isNAM, isSAM, isEUR, isAP, isWomens, isTitle) %>%
cor() %>%
corrplot(method='number', diag=FALSE)
# is there any correlation between the PPV data and records?
ppv_with_records <- ppv_data %>%
# this step is so we can get the full name, which is easier to use when joining to df_records
inner_join(fight_data_names, by = c("date", "fighter_1", "fighter_2")) %>%
select(date, PPV, fighter_1, fighter_2, R_fighter, B_fighter) %>%
inner_join(df_records, by = join_by(date, R_fighter == fighter)) %>%
inner_join(df_records, by = join_by(date, B_fighter == fighter), suffix = c("_R", "_B")) %>%
mutate(max_win_percent = pmax(win_percentage_R, win_percentage_B),
max_win_streak = pmax(win_streak_R, win_streak_B))
ppv_with_records$avg_win_percent = rowMeans(ppv_with_records[, c('win_percentage_R', 'win_percentage_B')])
ppv_with_records$avg_win_streak = rowMeans(ppv_with_records[, c('win_streak_R', 'win_streak_B')])
# surprisingly isn't a huge correlation between the win streak or win percentage and the PPV
ppv_with_records %>%
mutate(date_numeric = as.numeric(date)) %>%
select(PPV, date_numeric, max_win_percent, max_win_streak, avg_win_percent, avg_win_streak) %>%
cor() %>%
corrplot(method = 'number', diag=FALSE)
ppv_with_records %>%
filter(max_win_percent > 0) %>%
ggplot(aes(x = max_win_percent, y = PPV)) +
geom_point()
ppv_with_records %>%
group_by(max_win_streak) %>%
summarize(avg_PPV = mean(PPV)) %>%
ggplot(aes(x = max_win_streak, y = avg_PPV)) +
geom_col()
ppv_with_records %>%
filter(avg_win_percent > 0) %>%
ggplot(aes(x = avg_win_percent, y = PPV)) +
geom_point()
ppv_with_records %>%
group_by(avg_win_streak) %>%
summarize(avg_PPV = mean(PPV)) %>%
ggplot(aes(x = avg_win_streak, y = avg_PPV)) +
geom_col()
# In the LONG format of ppv with aggregate stats, each match date will have 2 rows, one row for each fighter. I figured this would make it easier to do the correlation plots for EDA
ppv_stats_r <- ppv_data %>%
# this step is so we can get the full name, which is easier to use when joining to agg_stats
inner_join(fight_data_names, by = c("date", "fighter_1", "fighter_2")) %>%
select(date, PPV, fighter_1, fighter_2, R_fighter, B_fighter) %>%
inner_join(agg_stats, by = join_by(date, R_fighter == fighter))
ppv_stats_b <- ppv_data %>%
# this step is so we can get the full name, which is easier to use when joining to agg_stats
inner_join(fight_data_names, by = c("date", "fighter_1", "fighter_2")) %>%
select(date, PPV, fighter_1, fighter_2, R_fighter, B_fighter) %>%
inner_join(agg_stats, by = join_by(date, B_fighter == fighter))
ppv_with_agg_stats_long <- rbind(ppv_stats_r, ppv_stats_b)
# Replace all NAs with 0 - This only happened if it was a fighter's debut
ppv_with_agg_stats_long$is_debut <- ifelse(is.na(ppv_with_agg_stats_long$KD), 1, 0)
ppv_with_agg_stats_long[is.na(ppv_with_agg_stats_long)] <- 0
ppv_with_agg_stats_long %>%
select(-c(date, fighter_1, fighter_2, R_fighter, B_fighter)) %>%
cor() %>%
corrplot(method='number', diag=FALSE)
# Scatterplot of PPV by date
ggplot(ppv_data, aes(x=date, y=PPV)) +
geom_point() +
xlab("Date") + ylab("PPV Sales") + ggtitle("PPV Sales by Date") +
theme_minimal()
# Boxplot of PPV by win Streak
ppv_with_records %>%
mutate(streak_cut = cut(round(avg_win_streak), breaks = 1:4, labels = c("0", "1", "2")),
streak_cut = ifelse(is.na(streak_cut), "3+", streak_cut)) %>%
ggplot(aes(x = streak_cut, y = PPV)) +
geom_boxplot() +
xlab("Win Streak of Main Event Players") + ylab("PPV Sales") +
ggtitle("PPV Sales by Win Streak") +
theme_minimal()
# Boxplot of PPV by Continent
ppv_with_location_division %>%
mutate(Continent = case_when(
country %in% c("USA", "Canada", "Mexico") ~ "North America",
country %in% c("Brazil") ~ "South America",
country %in% c("United Kingdom", "Ireland", "Germany") ~ "Europe",
country %in% c("Australia", "United Arab Emirates", "Japan") ~ "Asia",
.default = "should_never_get_here"
)) %>%
ggplot(aes(x = Continent, y = PPV)) +
geom_boxplot() +
ylab("PPV Sales") + ggtitle("PPV Sales by Continent") +
theme_minimal()
# Boxplot of PPV by Whether Main Event is a Title Match
ppv_with_location_division %>%
mutate(matchType = ifelse(isTitle == 1, "Title Match", "Non-Title Match")) %>%
ggplot(aes(x = matchType, y = PPV)) +
geom_boxplot() +
xlab("Match Type") + ylab("PPV Sales") +
ggtitle("PPV Sales by Whether Main is Title Match") +
theme_minimal()
From looking at the correlation matrix, it was clear that some variables are multicollinear. For example, the CLINCH_ATT (corner strikes in the clinch attempted) and CLINCH_LND (corner strikes in the clinch landed) have a correlation of 0.98. After reading some articles online, it was determined that one way to deal with this outside of removing the variable is to combine multicollinear variables into one. Thus, I decided to transform any LND and ATT variables into one percent variabble (percent = LND / ATT). After creating these variables, I removed the original LND/ATT variables. Note that some LND/ATT variables already existed
# Convert the LND and ATT statistics into percentage ones
agg_stats_pct <- agg_stats %>%
mutate(TOT_STR_pct = TOT_STR_LND / TOT_STR_ATT,
HEAD_pct = HEAD_LND / HEAD_ATT,
BODY_pct = BODY_LND / BODY_ATT,
LEG_pct = LEG_LND / LEG_ATT,
DIST_pct = DIST_LND / DIST_ATT,
CLINCH_pct = CLINCH_LND / CLINCH_ATT,
GRND_pct = GRND_LND / GRND_ATT) %>%
select(-c(SIG_STR_LND, SIG_STR_ATT, TOT_STR_LND, TOT_STR_ATT, TD_LND, TD_ATT, HEAD_LND, HEAD_ATT, BODY_LND, BODY_ATT, LEG_LND, LEG_ATT, DIST_LND, DIST_ATT,
CLINCH_LND, CLINCH_ATT, GRND_LND, GRND_ATT))
# Combine the PPV data with the aggregated stats
ppv_with_agg_stats_pct <- ppv_data %>%
# this step is so we can get the full name, which is easier to use when joining to agg_stats_pct
inner_join(fight_data_names, by = c("date", "fighter_1", "fighter_2")) %>%
select(date, PPV, fighter_1, fighter_2, R_fighter, B_fighter) %>%
inner_join(agg_stats_pct, by = join_by(date, R_fighter == fighter)) %>%
inner_join(agg_stats_pct, by = join_by(date, B_fighter == fighter), suffix = c("_R", "_B"))
# Replace all NAs with 0 - This only happened if it was a fighter's debut
ppv_with_agg_stats_pct[is.na(ppv_with_agg_stats_pct)] <- 0
ppv_with_records[is.na(ppv_with_records)] <- 0
# Combine aggregated stats, records, and PPV data
ppv_data_joined_for_lm <- ppv_with_records %>%
inner_join(ppv_with_agg_stats_pct, by = c("date", "PPV", "fighter_1", "fighter_2", "R_fighter", "B_fighter")) %>%
inner_join(ppv_with_location_division, by = c("date", "PPV")) %>%
# remove columns related to name (not needed for linear regression)
select(-c(fighter_1, fighter_2, R_fighter, B_fighter, location, country, Fight_type)) %>%
# remove columns that can be calculated via other columns (aliased columns)
select(-c(win_percentage_R, win_percentage_B, max_win_percent, max_win_streak, avg_win_percent, avg_win_streak, isAP))
# taking this from Andrew's code to get train-validation split :)
set.seed(42)
m <- nrow(ppv_data_joined_for_lm)
trn_rows <- sample(1:m, size=round(m*0.8), replace=FALSE)
d_train <- ppv_data_joined_for_lm[trn_rows,]
d_val <- ppv_data_joined_for_lm[-trn_rows,]
To figure out the best model for the feature, I decide to do the following: - Run the model on a training set (80% of the data) - Calculate the R^2 and MSE on the test set (20% of the data) Since there were lots of variables, I decided to use stepwise regression for feature selection. The total models I considered were: - Running on all predictors - Running on all predictors, but applying a log transformation on PPV - Running on predictors chosen from stepwise regression - Running on predictors chosen from stepwise regression, but applying a log transformation on PPV
col_names <- c("Model", "Train R^2", "Train MSE", "Test R^2", "Test MSE")
model_fit<- data.frame(matrix(ncol=5,nrow=0,
dimnames=list(NULL, col_names)))
fit_lm_model <- function(formula, model_name){
# Fit the model
lm_model <- lm(formula, d_train)
#print(summary(lm_model))
# Gather model fit metrics on training set
train_r_squared <- summary(lm_model)$r.squared
train_MSE <- mean(summary(lm_model)$residuals^2)
# Gather model fit metrics on validation set
val_pred <- predict(lm_model, d_val)
validate_SSE <- sum((d_val$PPV - val_pred)^2)
validate_y_bar <- mean(d_val$PPV)
validate_SST <- sum((d_val$PPV - validate_y_bar)^2)
validate_r_squared <- 1 - (validate_SSE / validate_SST)
validate_MSE <- mean((d_val$PPV - val_pred)^2)
# Add a row to model_fit with these metrics
observation <- data.frame(model_name, train_r_squared, train_MSE, validate_r_squared, validate_MSE)
names(observation) <- col_names
model_fit <<- rbind(model_fit, observation)
# rbind led to some weird row numbers, this helps remove some confusion
rownames(model_fit) <<- NULL
}
To get formulas for stepwise regression, I used the step() function. For the sake of
fit_lm_model(PPV ~ ., "All predictors")
fit_lm_model(log(PPV) ~ ., "All predictors, log(PPV)")
fit_lm_model(PPV ~ total_wins_R + total_draws_R + lose_streak_R +
total_wins_B + SIG_STR_pct_R + HEAD_pct_R + CLINCH_pct_R +
GRND_pct_R + KD_B + SUB_ATT_B + REV_B + CTRL_B + HEAD_pct_B +
CLINCH_pct_B + isNAM + isWomens + isTitle, "Stepwise Regression Predictors")
fit_lm_model(PPV ~ date + total_wins_R + total_draws_R + lose_streak_R +
total_wins_B + total_losses_B + KD_R + REV_R + CTRL_R + BODY_pct_R +
CLINCH_pct_R + TD_pct_B + SUB_ATT_B + TOT_STR_pct_B + BODY_pct_B +
CLINCH_pct_B + GRND_pct_B + isUSA + isSAM + isWomens + isTitle, "Backward Selection Predictors")
model_fit
## Model Train R^2 Train MSE Test R^2
## 1 All predictors 0.5142681 4.654322e+10 0.19464344
## 2 All predictors, log(PPV) 0.5935205 2.867805e-01 -2.07748686
## 3 Stepwise Regression Predictors 0.3730609 6.007381e+10 0.31792694
## 4 Backward Selection Predictors 0.4795769 4.986736e+10 0.01143136
## Test MSE
## 1 81114005895
## 2 309958718162
## 3 68697122485
## 4 99566783576
lm_best <- lm(PPV ~ total_wins_R + total_draws_R + lose_streak_R +
total_wins_B + SIG_STR_pct_R + HEAD_pct_R + CLINCH_pct_R +
GRND_pct_R + KD_B + SUB_ATT_B + REV_B + CTRL_B + HEAD_pct_B +
CLINCH_pct_B + isNAM + isWomens + isTitle, data = ppv_data_joined_for_lm)
summary(lm_best)
##
## Call:
## lm(formula = PPV ~ total_wins_R + total_draws_R + lose_streak_R +
## total_wins_B + SIG_STR_pct_R + HEAD_pct_R + CLINCH_pct_R +
## GRND_pct_R + KD_B + SUB_ATT_B + REV_B + CTRL_B + HEAD_pct_B +
## CLINCH_pct_B + isNAM + isWomens + isTitle, data = ppv_data_joined_for_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -586866 -155883 -48806 121163 852977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -431711 170532 -2.532 0.012365 *
## total_wins_R 11721 5344 2.193 0.029805 *
## total_draws_R -140612 58924 -2.386 0.018240 *
## lose_streak_R -120605 48300 -2.497 0.013583 *
## total_wins_B 19813 5552 3.569 0.000480 ***
## SIG_STR_pct_R -1760867 489241 -3.599 0.000431 ***
## HEAD_pct_R 1163652 456488 2.549 0.011783 *
## CLINCH_pct_R 634310 234207 2.708 0.007533 **
## GRND_pct_R 556829 196981 2.827 0.005330 **
## KD_B 850486 340211 2.500 0.013478 *
## SUB_ATT_B 926572 388301 2.386 0.018245 *
## REV_B 932018 1284088 0.726 0.469057
## CTRL_B 5436 2097 2.592 0.010455 *
## HEAD_pct_B -665160 211902 -3.139 0.002035 **
## CLINCH_pct_B 321484 135821 2.367 0.019186 *
## isNAM 224366 66002 3.399 0.000861 ***
## isWomens 395466 96396 4.103 6.62e-05 ***
## isTitle -100552 56536 -1.779 0.077298 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 258000 on 153 degrees of freedom
## Multiple R-squared: 0.3857, Adjusted R-squared: 0.3175
## F-statistic: 5.652 on 17 and 153 DF, p-value: 8.545e-10
par(mfrow = c(2, 2))
plot(lm_best)
# Get correlation of variables with PPV (similar format as Alejandro's for report)
cor_mat <- ppv_data_joined_for_lm %>%
select(PPV, total_wins_R, total_draws_R, lose_streak_R, total_wins_B, SIG_STR_pct_R,
HEAD_pct_R, CLINCH_pct_R, GRND_pct_R, KD_B, SUB_ATT_B, REV_B, CTRL_B,
HEAD_pct_B, CLINCH_pct_B, isNAM, isWomens, isTitle) %>%
cor()
ppv_correlations <- cor_mat["PPV", -1]
cor_df <- data.frame(Variable = names(ppv_correlations), Correlation = ppv_correlations)
ggplot(cor_df, aes(x = reorder(Variable, Correlation), y = Correlation, fill = Correlation)) +
geom_bar(stat = "identity") +
coord_flip() + # Flip coordinates for better visualization of variable names
labs(x = "Variable", y = "Correlation with PPV") +
ggtitle("Correlation of Variables with PPV") +
theme_minimal() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab", name="Correlation")